Downloading Python Modules and Libraries.

# Configure reticulate so the Python chunks below run on the system Python 3.
library(reticulate)
# Spell out FALSE rather than the abbreviation F, which is an ordinary,
# re-assignable binding in R and can be silently shadowed.
use_python("/usr/bin/python3", required = FALSE)
# One-time installs of the Python packages used in this report; kept
# commented out so re-knitting does not reinstall them.
#py_install("pandas")
#py_install("seaborn")
#py_install("matplotlib")
#py_install("numpy")
#py_install("pandas_profiling")
#py_install("scikit-learn")

Importing modules and Dataset for Use.

# Core analysis stack: pandas for tabular data, matplotlib/seaborn for
# plotting, numpy for numerics.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the heart-disease dataset from the working directory.
# Assumes heart.csv provides the columns used below (Age, Cholesterol,
# MaxHR, ChestPainType, RestingBP, HeartDisease) -- TODO confirm.
heart = pd.read_csv('heart.csv')
# Recode the 0/1 target as boolean for clearer modeling/plotting downstream.
heart['HeartDisease'] = heart['HeartDisease'].astype('bool')

Basic Visualization and Statistics

# Histogram of patient ages over 30 bins; the column is passed by name via
# the data= keyword. The echoed tuple below is matplotlib's
# (counts, bin_edges, patches) return value captured by the knitted output.
plt.hist('Age', data = heart, color = 'black',bins = 30)
## (array([ 4.,  3.,  5.,  9., 17., 11., 31., 37., 18., 43., 18., 43., 52.,
##        25., 71., 84., 41., 76., 77., 32., 66., 30., 43., 28., 10., 20.,
##         9.,  1., 10.,  4.]), array([28.        , 29.63333333, 31.26666667, 32.9       , 34.53333333,
##        36.16666667, 37.8       , 39.43333333, 41.06666667, 42.7       ,
##        44.33333333, 45.96666667, 47.6       , 49.23333333, 50.86666667,
##        52.5       , 54.13333333, 55.76666667, 57.4       , 59.03333333,
##        60.66666667, 62.3       , 63.93333333, 65.56666667, 67.2       ,
##        68.83333333, 70.46666667, 72.1       , 73.73333333, 75.36666667,
##        77.        ]), <a list of 30 Patch objects>)
plt.title('Distribution of Ages in Sample')
plt.ylabel('Frequency')
plt.xlabel('Ages (in Years)')
plt.show()

# Scatter of cholesterol vs. max heart rate; points are colour-mapped by
# MaxHR itself, so the vertical colour gradient doubles as a legend.
plt.scatter(heart['Cholesterol'],heart['MaxHR'], c = heart['MaxHR'], cmap = 'YlGnBu')
plt.title("Cholesterol vs Max Heart Rate")
plt.xlabel('Cholesterol')
plt.ylabel('MaxHR')
plt.show()

# Box plots of cholesterol per chest-pain type; hue mirrors x so each
# category gets its own colour. The Axes returned by sns.boxplot is
# intentionally discarded (the previous `plot =` binding was never used).
sns.boxplot(x='ChestPainType', y='Cholesterol', data=heart, hue='ChestPainType')
plt.title('Cholesterol Levels Among Chest Pain Types')
plt.xlabel('Chest Pain Types')
plt.ylabel('Cholesterol')
plt.show()

Supervised Learning Methods (Logistic & Linear Regression)

import sklearn.model_selection
from sklearn.model_selection import train_test_split

# Features: three numeric predictors; target: the boolean HeartDisease
# flag (kept as a single-column frame, flattened later for sklearn).
x = heart.loc[:,['Cholesterol','MaxHR','Age']]
y = heart.loc[:,['HeartDisease']]

# Pin random_state so the split -- and every model score printed below --
# is reproducible across knits (default split is 75% train / 25% test).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

Multivariate Linear Regression

# Ordinary least squares fit on the boolean target (a linear-probability
# model); its predictions are unbounded reals, not probabilities.
print("Linear Regression")
## Linear Regression
import sklearn.linear_model
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train)
## LinearRegression()
print("Linear Regression Model Equation")
## Linear Regression Model Equation
# Coefficients for Cholesterol, MaxHR, Age (in that order), then intercept.
print(model.coef_)
## [[-0.00064462 -0.00576024  0.00854198]]
print(model.intercept_)
## [1.01252496]
print("Linear Regression Model Accuracy")
## Linear Regression Model Accuracy
# NOTE(review): LinearRegression.score returns R^2 on the test set, not
# classification accuracy -- the printed label above overstates it.
print(model.score(x_test, y_test))
## 0.20378232250799166
prediction = model.predict(x_test)

Logistic Regression

# Logistic regression on the same three features; unlike the linear model
# above, .score here really is classification accuracy.
print("Logistic Regression")
## Logistic Regression
import sklearn.linear_model
from sklearn.linear_model import LogisticRegression

# Convert the feature frames to plain numpy arrays.
x_test = x_test.to_numpy()
x_train = x_train.to_numpy()

# Flatten the single-column target frames to 1-D vectors, the shape
# sklearn expects (avoids the column-vector DataConversionWarning).
y_train = y_train.values.flatten()
y_test = y_test.values.flatten()

model = LogisticRegression()
model.fit(x_train, y_train)
## LogisticRegression()
print("Logistic Regression Model Equation")
## Logistic Regression Model Equation
# Log-odds coefficients for Cholesterol, MaxHR, Age respectively.
print(model.coef_)
## [[-0.00338517 -0.02874657  0.04110146]]
print(model.intercept_)
# model.predict(x_test)
## [2.68949655]
print("Logistic Regression Model Accuracy")
## Logistic Regression Model Accuracy
print(model.score(x_test, y_test))
## 0.7130434782608696

Visualizations for single variable linear and logistic regression.

# Single-variable linear fit: Cholesterol as a function of Age, with slight
# horizontal jitter so overlapping integer ages remain visible.
sns.lmplot(x="Age", y="Cholesterol", data=heart, x_jitter=.05, scatter_kws={"color": "lightgreen"}, line_kws={"color": "blue"});

plt.show()

# Single-variable logistic curve: probability of heart disease as a
# function of cholesterol (logistic=True fits a logistic regression;
# ci=False skips the bootstrapped confidence band).
sns.regplot(x = heart['Cholesterol'], y = heart['HeartDisease'], logistic = True, ci = False, scatter_kws={"color": "lightgreen"}, line_kws={"color": "blue"});

plt.show()

Decision Trees

import sklearn.tree
from sklearn.tree import DecisionTreeClassifier

import sklearn.model_selection
from sklearn.model_selection import train_test_split

# Rebuild features/target and re-split for the tree models.
# NOTE(review): no random_state here, so this split differs from the one
# used for the regressions and changes on every knit.
x = heart.loc[:,['Cholesterol','MaxHR','Age']]
y = heart.loc[:,['HeartDisease']]

x_train, x_test, y_train, y_test = train_test_split(x,y)

# x and y are reused here as plot data (candidate depths vs. test accuracy),
# shadowing the feature/target frames assigned just above.
x = range(1,30,5)
y = []
print("Loop to find Optimal Number of Splits")
## Loop to find Optimal Number of Splits
# Sweep max_depth = 1, 6, 11, 16, 21, 26 and record test accuracy for each.
# NOTE(review): max_depth limits tree *depth*, not the number of splits,
# despite the "Splits" wording in the labels.
for i in range(1,30,5) :
    Tree = DecisionTreeClassifier(max_depth = i, random_state = 2)
    Tree.fit(x_train, y_train)
    y.append(Tree.score(x_test, y_test))
## DecisionTreeClassifier(max_depth=1, random_state=2)
## DecisionTreeClassifier(max_depth=6, random_state=2)
## DecisionTreeClassifier(max_depth=11, random_state=2)
## DecisionTreeClassifier(max_depth=16, random_state=2)
## DecisionTreeClassifier(max_depth=21, random_state=2)
## DecisionTreeClassifier(max_depth=26, random_state=2)
plt.plot(x,y, color = 'gray')
plt.title('Optimal Number of Splits')
plt.xlabel('Number of Splits')
plt.ylabel('Accuracy')
plt.show()

print("We can see from the graph that the optimal number of splits is around 6. However, this makes it that the graph is hard to read. Therefore the tree below has two splits.")
## We can see from the graph that the optimal number of splits is around 6. However, this makes it that the graph is hard to read. Therefore the tree below has two splits.
from sklearn import tree

# Shallow tree (max_depth=2) chosen for readability over the slightly more
# accurate depth ~6 found in the sweep above. The dead `features = x` /
# `target = y` aliases were removed: at this point x/y held the sweep's
# plot data (a range and a score list), not the training frames, and
# neither alias was ever used.
Tree = DecisionTreeClassifier(max_depth = 2, random_state = 2)
Tree.fit(x_train, y_train)
## DecisionTreeClassifier(max_depth=2, random_state=2)
print(Tree.score(x_test, y_test))
## 0.7217391304347827
# Oversized figure so the node text in the rendered tree stays legible.
plt.figure(figsize = (50,50))
tree.plot_tree(Tree, filled = True)
plt.show()

PCA (Principal Component Analysis)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Restrict to the columns of interest; select_dtypes(include='number')
# drops the boolean HeartDisease flag, leaving Age/Cholesterol/RestingBP.
heart = heart[['Age','Cholesterol','RestingBP', 'HeartDisease']]
dataset = heart.select_dtypes(include = 'number')
Standardize = StandardScaler()

# BUG FIX: previously the scaled matrix was computed and then discarded,
# so PCA ran on the raw features and the first component was dominated by
# whichever column had the largest scale. Feed the standardised data in.
scaled = Standardize.fit_transform(dataset)
PCA_Heart = PCA(n_components = 3)
PCs = PCA_Heart.fit_transform(scaled)
# Share of total variance captured by each principal component.
PCA_Heart.explained_variance_ratio_

# First two components, coloured by heart-disease status; large marker
# size pairs with the oversized fonts below for a poster-scale figure.
plt.scatter(PCs[:,0], PCs[:,1], s = 1000, c = heart['HeartDisease'], cmap = 'Spectral')

plt.title('PCA of Heart Disease', fontsize = 100)
plt.xlabel('First Principal Component', fontsize = 90)
plt.ylabel('Second Principal Component', fontsize = 90)
# The former plt.axis(fontsize=60) call was removed: axis() does not take
# fontsize (a TypeError on matplotlib >= 3.3, per the deprecation warning
# in the original output). Tick label sizes are set here instead.
plt.xticks(fontsize = 60)
plt.yticks(fontsize = 60)
plt.show()